//Semi-automated categorization of open-ended questions 
// example 2: disclosure
// q2           : text variable (the open-ended answers)
// code1        : categorization (y), the outcome being predicted
// id, instance : two different identifying variables

version 13.0
clear 

// Prerequisites: the user-written commands -boost- and -ngram- must be installed,
// e.g.  net search ngram  and follow the links.

// Session settings
set scheme        s1color    
set copycolor     asis       
set memory 100m               // kept for older Statas; Stata 12+ ignores this setting
set more off
set seed 12345678             // fixes the random shuffle generated further down
set matsize 8000

// Start a fresh log.
// -log using- writes boost_why.smcl unless the log type is set to text, so
// removing boost_why.log alone does not make the script re-runnable; without
// -replace- a second run stops with "file already exists".
capture log close 
capture rm boost_why.log
log using boost_why, replace
*****************************************************
* Load the data and fix the size of the training set
use disclosure_why
local trainn = 500
******************************************************
// Flag every observation except the first one within each code.
// (Only needed for the alternative sort below, which would put one complete
//  set of codes at the start of the data so that the training rows contain
//  at least one observation of each code.)
by code1, sort: generate duplicatecode = (_n != 1)

// Random ordering of the observations; the first `trainn' rows become training data
generate u = uniform()
//sort year_start duplicatecode u   //a complete set of codes at the beginning
sort u
* Decide which categories to use
// Note: 80 is a new code because the coding for "why" does not contain an "other" category
local catch_all      = 80
local min_occurrence = 10

// Any listed code occurring fewer than `min_occurrence' times in the TRAINING rows
// is folded into the catch-all code (in all rows, not only the training rows).
foreach code of numlist 1 2 4 5 10/13 20/28 30 98 99 {
	display "Considering code `code'"
	count in 1/`trainn' if code1 == `code'
	local n_in_train = r(N)
	replace code1 = `catch_all' if (code1 == `code') & (`n_in_train' < `min_occurrence')
}
tabulate code1
*****************************************************
// Number of distinct codes among the training rows
// (codes that occur only outside the training rows are not counted)
quietly tabulate code1 in 1/`trainn'
local n_codes = r(r)
display "Number of categories in training data =`n_codes'"

// Remember the number of observations at this point for later -in- ranges
local N = _N
**************************************************************************
// Turn the text variable q2 into n-gram variables; the model below uses the
// resulting n_token and t_* variables as predictors
set locale_functions en
ngram q2, threshold(5) degree(2) stemmer binarize
**************************************************************************
 // boosting
// Remove prediction variables left over from an earlier run (ignored if absent)
capture drop boost_pred1-boost_pred`n_codes'

// Fit the multinomial boosting model on the first `trainn' rows and time it
profiler on
boost code1 n_token t_*  in 1/`trainn' ,  ///
	dist(multinomial)                     ///
	train(0.8)                            ///
	maxiter(3000)                         ///
	bag(.5)                               ///
	interaction(5)                        ///
	shrink(0.1)                           ///
	pred("boost_pred")                    ///
	influence                             ///
	seed(1)
profiler off
profiler report
profiler clear

// Show everything boost left behind in e()
ereturn list
***************************************************************************
* Influence of the predictors
// Copy e(influence) into variables influence1, influence2, ...
matrix influence = e(influence)
svmat influence

// Row index, kept only for rows that actually hold an influence value
generate id2 = _n
replace id2 = . if influence1 == .
//graph bar (mean) influence1, over(id2) ytitle(Percentage Influence) 

* Number of variables with any influence
* (each predictor occupies one row of the influence matrix)
egen influence_sum = rowtotal(influence1-influence`n_codes')
count if influence_sum > 0 & influence_sum != .
egen influence_max = rowmax(influence1-influence`n_codes')

// Store the matrix row names (the predictor names) next to their influence values
local names : rownames influence
generate names = ""
local n_names : word count `names'
forvalues row = 1/`n_names' {
	local this_name : word `row' of `names'
	quietly replace names = `"`this_name'"' in `row'
}
************************************************************************
// Compute pred_max (largest predicted value across categories) and
// predcat (the category attaining it).
//
// pred_max is created as double: -egen- would otherwise store it as float,
// and the exact-equality test in the loop below would fail for every row if
// the boost_pred* variables are stored as double. With double storage the
// comparison is exact whether boost_pred* are float or double.
egen double pred_max = rowmax(boost_pred1-boost_pred`n_codes')
generate predcat = .

foreach var of varlist boost_pred1-boost_pred`n_codes' {
	// the variable label of each prediction variable holds the category code
	local category : variable label `var'
	// if two categories tie for the maximum, the later variable wins
	replace predcat = `category' if pred_max == `var' & pred_max != .
}
tabulate code1 predcat, row

* Compute correct predictions
* (svmat may have expanded the data set with additional rows; restrict to 1/N)
generate correct = .
replace correct = (code1 == predcat) in 1/`N'

* Rows 1..trainn were passed to boost, so the evaluation rows start at trainn+1.
* (Starting at `trainn' would count one training observation as test data.)
local test_start = `trainn' + 1

tabulate correct in 1/`trainn'
tabulate correct in `test_start'/`N'
tabulate code1 correct in `test_start'/`N', row   // prediction accuracy of categories
*************************************************************************
* Accuracy
* svmat may expand the number of observations; do not redefine `N'
* The -accuracy- program is defined by expected_accuracy.do; drop any earlier
* definition first so that the do-file can define it again.
capture program drop accuracy
do expected_accuracy.do
accuracy pred_max code1 predcat in `test_start'/`N', p_man(.8) 
*****************************************************************************
// Store the data together with predictions and influence variables, then close the log
save disclosure_why_merged_boost, replace
capture log close

****************************************************************************

